library(tidyverse)
library(glmnet)
library(caTools)
library(caret)
library(ROCR)
# manually coded artifact volumes (ground truth for striping/intensity codes)
coded = read.csv("~/Documents/code/dsnlab/automotion-test-set/tds_artifact_coded_volumes.csv")

# striping metric files, one CSV per subject/run
fileDir = "~/Documents/code/dsnlab/automotion-test-set/output/"
filePattern = "tds_stripes_.*.csv"
file_list = list.files(fileDir, pattern = filePattern)

# Read every stripes file and bind the rows once. This replaces the original
# exists("stripes")/rbind-in-a-loop accumulator, which copies the growing data
# frame on every iteration (quadratic) and, because it keys off exists(),
# silently appends to a stale `stripes` left over from a previous session.
# select(-file) drops the `file` column present in each CSV.
stripes = bind_rows(lapply(file_list, function(f) {
  read.csv(paste0(fileDir, f)) %>%
    rename("volume" = t,
           "subjectID" = subject) %>%
    select(-file)
}))
# define paths and variables
rpDir <- '~/Documents/code/dsnlab/automotion-test-set/rp_txt/'
outputDir <- '~/Documents/code/tds_auto-motion/auto-motion-output/'
plotDir <- '~/Documents/code/tds_auto-motion/auto-motion-output/plots/'
study <- "tds2"
# rp filenames look like rp_<3-digit subject>_<run>.txt; the two capture
# groups are extracted into subjectID and run below
rpPattern <- "^rp_([0-9]{3})_(.*).txt"
rpCols <- c("euclidian_trans", "euclidian_rot", "euclidian_trans_deriv",
            "euclidian_rot_deriv", "trash.rp")
# global intensity summary produced by the auto-motion pipeline
intensities <- read.csv(paste0(outputDir, study, '_globalIntensities.csv'))
# subject 157, run stop3: shift volume numbers above 43 down by one so the
# intensity rows line up with the other data sources for that run
intensities <- intensities %>%
  mutate(volume = ifelse(subjectID == 157 & run == "stop3" & volume > 43,
                         volume - 1, volume))
# rp (realignment parameter) files: one whitespace-delimited text file per
# subject/run, columns defined by rpCols above
file_list = list.files(rpDir, pattern = rpPattern)

# Read every rp file and bind the rows once. This fixes two problems in the
# original loop: (1) the rbind-in-a-loop accumulator (quadratic copying,
# stale-variable reuse via exists()), and (2) rep(file, count(temp)) --
# dplyr::count() returns a one-row data frame, not the row count, so it is
# not a valid `times` argument. mutate(file = f) recycles the scalar
# filename over all rows, which is what was intended.
rp = bind_rows(lapply(file_list, function(f) {
  temp = read.table(paste0(rpDir, f))
  colnames(temp) = rpCols
  temp %>%
    mutate(file = f,
           volume = row_number()) %>%
    # split the filename into subjectID and run via the capture groups
    extract(file, c("subjectID", "run"), rpPattern) %>%
    mutate(subjectID = as.integer(subjectID))
}))
# join striping metrics, hand codes, global intensities, and realignment
# parameters on subject/run/volume; volumes absent from the coded file get
# striping/intensity of 0 (i.e. coded as clean)
joined = left_join(stripes, coded, by = c("subjectID", "run", "volume")) %>%
  left_join(intensities, by = c("subjectID", "run", "volume")) %>%
  left_join(rp, by = c("subjectID", "run", "volume")) %>%
  mutate(striping = ifelse(is.na(striping), 0, striping),
         intensity = ifelse(is.na(intensity), 0, intensity))

# tidy data: one column per frequency tile; artifact = 1 when either hand
# code exceeds 1. pivot_wider() replaces the superseded spread();
# names_sort = TRUE reproduces spread()'s sorted column ordering.
data = joined %>%
  mutate(tile = paste0("tile_", tile),
         artifact = ifelse(striping > 1 | intensity > 1, 1, 0)) %>%
  pivot_wider(names_from = tile, values_from = freqtile_power,
              names_sort = TRUE) %>%
  select(-striping, -intensity, -euclidian_rot_deriv, -trash.rp, -fsl.volume) %>%
  select(subjectID, run, volume, euclidian_trans_deriv, artifact, everything())
# create training and test sets (75/25, stratified on the artifact label)
set.seed(101)
sample <- sample.split(data$artifact, SplitRatio = 0.75)
train <- subset(data, sample)
test <- subset(data, !sample)
# subset predictors and criterion: columns 1-4 are identifiers/covariates,
# column 5 is the binary artifact outcome, the rest are predictors
x_train <- as.matrix(train[, -(1:5)])
y_train <- as.double(as.matrix(train[, 5]))
# run 10-fold cross-validation to choose the lasso penalty (alpha = 1),
# maximizing AUC for the binomial (logistic) model.
# NOTE(review): parallel=TRUE requires a registered foreach backend
# (e.g. doParallel::registerDoParallel()); none is registered in this
# script, so glmnet will warn and fall back to sequential -- confirm.
cv.train <- cv.glmnet(x_train, y_train, family='binomial', alpha=1, parallel=TRUE, standardize=TRUE, type.measure='auc')
# CV curve: AUC as a function of log(lambda)
plot(cv.train)
# coefficient paths across the lambda sequence
plot(cv.train$glmnet.fit, xvar="lambda", label=TRUE)
# lambda minimizing CV error, and the largest lambda within 1 SE of it
cv.train$lambda.min
## [1] 0.0003247472
cv.train$lambda.1se
## [1] 0.001194543
# nonzero coefficients at each candidate lambda ('.' = dropped by the lasso)
coef(cv.train, s=cv.train$lambda.min)
## 16 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -4.361452e+00
## volMean .
## volSD -3.745048e-04
## euclidian_trans 1.042512e+00
## euclidian_rot 5.922844e-01
## tile_1 -3.392376e-01
## tile_10 1.253834e+04
## tile_11 -7.033361e+03
## tile_2 2.819671e+01
## tile_3 .
## tile_4 .
## tile_5 2.080045e+02
## tile_6 -1.345085e+03
## tile_7 .
## tile_8 2.824277e+03
## tile_9 -7.034841e+03
coef(cv.train, s=cv.train$lambda.1se)
## 16 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -4.754184e+00
## volMean .
## volSD .
## euclidian_trans 1.098392e+00
## euclidian_rot 4.784833e-01
## tile_1 -1.991706e-02
## tile_10 9.822724e+03
## tile_11 -4.871957e+03
## tile_2 .
## tile_3 .
## tile_4 -1.491836e+01
## tile_5 .
## tile_6 -4.916124e+02
## tile_7 .
## tile_8 2.018037e+01
## tile_9 -3.738765e+03
# predicted artifact probabilities on the training sample (lambda.1se model)
pred_train = predict(cv.train, newx = x_train, s=cv.train$lambda.1se, type="response")
# plot cutoff v. accuracy
predicted = prediction(pred_train, y_train, label.ordering = NULL)
perf = performance(predicted, measure = "acc")
# ROCR stores results in S4 list slots; [[1]] unwraps the single run
perf.df = data.frame(cut=perf@x.values[[1]],acc=perf@y.values[[1]])
ggplot(perf.df, aes(cut, acc)) +
geom_line()
# plot false v. true positive rate (ROC curve); perf/perf.df are reused
perf = performance(predicted, measure = "tpr", x.measure = "fpr")
perf.df = data.frame(cut=perf@alpha.values[[1]],fpr=perf@x.values[[1]],tpr=perf@y.values[[1]])
ggplot(perf.df, aes(fpr, tpr)) +
geom_line()
# plot specificity v. sensitivity across cutoffs
perf = performance(predicted, measure = "sens", x.measure = "spec")
perf.df = data.frame(cut=perf@alpha.values[[1]],sens=perf@x.values[[1]],spec=perf@y.values[[1]])
ggplot(perf.df, aes(spec, sens)) +
geom_line()
# overlay sensitivity and specificity against the probability cutoff
ggplot(perf.df, aes(x = cut)) +
geom_line(aes(y = sens, color = "sensitivity")) +
geom_line(aes(y = spec, color = "specificity"))
# cutoff maximizing sensitivity + specificity (Youden-style), and that maximum
cut = perf@alpha.values[[1]][which.max(perf@x.values[[1]]+perf@y.values[[1]])]
ss = max(perf@x.values[[1]]+perf@y.values[[1]]) # sensitivity + specificity
# confusion matrix on the training sample at a 0.2 probability cutoff.
# ifelse(>= .2) also classifies probabilities of exactly 0.2 (the original
# pair of assignments left such values as raw probabilities), and factor()
# with explicit levels satisfies caret::confusionMatrix(), which requires
# factor inputs in current caret versions.
pred_train = predict(cv.train, newx = x_train, s=cv.train$lambda.1se, type="response")
pred_train = ifelse(pred_train >= .2, 1, 0)
confusionMatrix(factor(pred_train, levels = c(0, 1)),
                factor(y_train, levels = c(0, 1)))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 5613 66
## 1 27 70
##
## Accuracy : 0.9839
## 95% CI : (0.9803, 0.987)
## No Information Rate : 0.9765
## P-Value [Acc > NIR] : 4.998e-05
##
## Kappa : 0.5929
## Mcnemar's Test P-Value : 8.134e-05
##
## Sensitivity : 0.9952
## Specificity : 0.5147
## Pos Pred Value : 0.9884
## Neg Pred Value : 0.7216
## Prevalence : 0.9765
## Detection Rate : 0.9718
## Detection Prevalence : 0.9832
## Balanced Accuracy : 0.7550
##
## 'Positive' Class : 0
##
######### test on holdout sample
# subset predictors and criterion (same column layout as the training set)
x_test = as.matrix(test[,-c(1,2,3,4,5)])
y_test = as.double(as.matrix(test[, 5]))
# predicted artifact probabilities on the holdout sample
pred_test = predict(cv.train, newx = x_test, s=cv.train$lambda.1se, type="response")
# dichotomize at 0.2; ifelse(>=) also classifies probabilities of exactly 0.2
pred_test = ifelse(pred_test >= .2, 1, 0)
# confusion matrix; factor() with explicit levels is required by current caret
confusionMatrix(factor(pred_test, levels = c(0, 1)),
                factor(y_test, levels = c(0, 1)))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1876 23
## 1 4 23
##
## Accuracy : 0.986
## 95% CI : (0.9797, 0.9907)
## No Information Rate : 0.9761
## P-Value [Acc > NIR] : 0.001563
##
## Kappa : 0.6235
## Mcnemar's Test P-Value : 0.000532
##
## Sensitivity : 0.9979
## Specificity : 0.5000
## Pos Pred Value : 0.9879
## Neg Pred Value : 0.8519
## Prevalence : 0.9761
## Detection Rate : 0.9740
## Detection Prevalence : 0.9860
## Balanced Accuracy : 0.7489
##
## 'Positive' Class : 0
##
######### logistic regression
## note: couldn't get predict function to predict test values
# log = glm(artifact ~ volMean + volSD + euclidian_trans + euclidian_rot + tile_1 + tile_2 + tile_3 + tile_4 + tile_5 + tile_6 + tile_7 + tile_8 + tile_9 + tile_10 + tile_11, family='binomial', data=train)
#
# pred = predict(log, newx = train, type="response")
# predicted = prediction(pred, y_train, label.ordering = NULL)
# perf = performance(predicted, measure = "sens", x.measure = "spec")
# perf.df = data.frame(cut=perf@alpha.values[[1]],sens=perf@x.values[[1]],spec=perf@y.values[[1]])
# ggplot(perf.df, aes(spec, sens)) +
# geom_line()
#
# ggplot(perf.df, aes(x = cut)) +
# geom_line(aes(y = sens, color = "sensitivity")) +
# geom_line(aes(y = spec, color = "specificity")) +
# geom_vline(xintercept = .03)
#
# pred = predict(log, newx = x_test, type="response")
# pred[pred > .03] = 1
# pred[pred < .03] = 0
# confusionMatrix(pred, y_test)
# Heuristic (non-ML) artifact classifier: flag volumes as trash from
# volume-to-volume intensity changes, realignment parameters, and striping
# in frequency tiles 1 and 10, then score against the hand codes.
comp = joined %>%
filter(tile == 1 | tile == 10) %>%
# code trash based on mean, sd, and rp
group_by(subjectID, run, tile) %>%
# volume-to-volume change in global mean/SD intensity, within subject/run/tile
mutate(Diff.mean = volMean - lag(volMean),
Diff.sd = volSD - lag(volSD)) %>%
ungroup %>%
# NOTE(review): these summary stats are computed AFTER ungroup(), so the
# mean/SD (and hence the +/- 2 SD thresholds below) are grand values pooled
# across all subjects, runs, and both tiles -- confirm that is intended.
mutate(meanDiff.mean = mean(Diff.mean, na.rm=TRUE),
sdDiff.mean = sd(Diff.mean, na.rm=TRUE),
meanDiff.sd = mean(Diff.sd, na.rm=TRUE),
sdDiff.sd = sd(Diff.sd, na.rm=TRUE),
# code volumes outside +/- 2 SD of the mean-intensity change as trash
upper.mean = meanDiff.mean + 2*sdDiff.mean,
lower.mean = meanDiff.mean - 2*sdDiff.mean,
trash.mean = ifelse(Diff.mean > upper.mean | Diff.mean < lower.mean, 1, 0),
upper.sd = meanDiff.sd + 2*sdDiff.sd,
lower.sd = meanDiff.sd - 2*sdDiff.sd,
trash.sd = ifelse(Diff.sd > upper.sd | Diff.sd < lower.sd, 1, 0),
# code volumes with more than +/- .25mm translation or rotation in Euclidian distance
trash.rp.tr = ifelse(euclidian_trans_deriv > .25 | euclidian_trans_deriv < -.25, 1, trash.rp),
trash.rp.rot = ifelse(euclidian_rot_deriv > .25 | euclidian_rot_deriv < -.25, 1, trash.rp)) %>%
select(-meanDiff.mean, -meanDiff.sd, -sdDiff.mean, -sdDiff.sd) %>%
# code trash based on striping: center tile power within subject/run/tile
group_by(subjectID, run, tile) %>%
mutate(freqtile_power_c = freqtile_power - mean(freqtile_power, na.rm=TRUE)) %>%
ungroup() %>%
select(-freqtile_power) %>%
# wide format: columns `1` and `10` hold centered power for the two tiles
spread(tile,freqtile_power_c) %>%
mutate(trash.stripe = ifelse(`1` < -.035 & `10` > .00025, 1, 0)) %>%
# combine trash: stripe flag alone, or at least TWO of the four other flags.
# NOTE(review): `> 1` requires two or more of trash.rp.tr/trash.rp.rot/
# trash.mean/trash.sd; if any single flag should suffice, this should be
# `> 0` -- confirm.
mutate(trash.combined = ifelse(trash.stripe == 1, 1, 0),
trash.combined = ifelse((trash.rp.tr + trash.rp.rot + trash.mean + trash.sd) > 1, 1, trash.combined)) %>%
#trash.combined = ifelse(lead(trash.combined) == TRUE, TRUE, trash.combined)) %>%
# recode as trash if volume behind and in front are both marked as trash.
# NOTE(review): lag()/lead() run on the ungrouped frame here, so the first
# volume of one run is compared against the last volume of the previous
# run -- confirm this boundary behavior is acceptable.
mutate(trash.combined = ifelse(trash.combined == 0 & lag(trash.combined) == 1 & lead(trash.combined) == 1, 1, trash.combined)) %>%
# code first volume as trash if second volume is trash
mutate(trash.combined = ifelse(volume == 1 & lead(trash.combined) == 1, 1, trash.combined)) %>%
# code hits: hit = flagged and hand-coded 2; neg = missed a hand-coded 2;
# pos = flagged but hand-coded 0 (false alarm); everything else NA
mutate(hits = ifelse(trash.combined == 1 & (striping == 2 | intensity == 2), "hit",
#ifelse(trash.combined == 1 & (striping == 1 | intensity == 1), "hit.light",
ifelse(trash.combined == 0 & (striping == 2 | intensity == 2), "neg",
#ifelse(trash.combined == 0 & (striping == 1 | intensity == 1), "neg.light",
ifelse(trash.combined == 1 & (striping == 0 | intensity == 0), "pos", NA))),
# NOTE(review): regexpr('.*', hits) is 1 (truthy) for every non-NA value and
# NA for NA hits, so label is the volume number for coded hits and NA --
# never '' -- for the rest; confirm the NA labels are what geom_text expects.
label = ifelse(regexpr('.*', hits), as.character(volume), ''),
hits = as.factor(hits)) %>%
# back to long format so plots can filter on a single tile
gather(tile, freqtile_power_c, c(`1`, `10`)) %>%
mutate(tile = as.numeric(tile))
# keep a single row per volume (tile 1 only); recode any nonzero hand code
# as an artifact and collapse the hit categories into four outcomes
comp.tab = comp %>%
  filter(tile == 1) %>%
  mutate(artifact = ifelse(striping > 0 | intensity > 0, 1, 0),
         hits.tot = case_when(hits %in% c("hit", "hit.light") ~ "hit",
                              hits %in% c("neg", "neg.light") ~ "neg",
                              hits %in% c("pos") ~ "pos",
                              is.na(hits) ~ "cor.rej",
                              TRUE ~ NA_character_))
# separate into training and test runs (same seed/ratio as the glmnet split)
set.seed(101)
comp.sample = sample.split(comp.tab$artifact, SplitRatio = .75)
# BUG FIX: the original subset() calls filtered on `sample` -- the split
# vector computed for the glmnet data at the top of the script -- instead of
# the comp.sample vector computed on the line above.
comp.train = subset(comp.tab, comp.sample == TRUE)
comp.test = subset(comp.tab, comp.sample == FALSE)
# machine learning performance on the training sample. Compute the confusion
# matrix once (factors with explicit levels are required by current caret)
# rather than rebuilding it for each extracted statistic.
cm.train = confusionMatrix(factor(pred_train, levels = c(0, 1)),
                           factor(y_train, levels = c(0, 1)))
cm.train$table
## Reference
## Prediction 0 1
## 0 5613 66
## 1 27 70
cm.train$overall[1]
## Accuracy
## 0.9838989
cm.train$byClass[11]
## Balanced Accuracy
## 0.7549593
# manual heuristic classifier: extract counts by name rather than positional
# [[i]], which silently mislabels the categories if any level is absent
hits.train = table(comp.train$hits.tot)
hits.train
##
## cor.rej hit neg pos
## 5599 101 33 43
#table(comp.train$hits)
cor.rej = hits.train[["cor.rej"]]
hit = hits.train[["hit"]]
neg = hits.train[["neg"]]
pos = hits.train[["pos"]]
total = cor.rej + hit + neg + pos
sprintf('Accuracy: %s', round((cor.rej + hit) / total,2))
## [1] "Accuracy: 0.99"
# balanced accuracy
sprintf('Balanced accuracy: %s', round(((cor.rej / (cor.rej + pos)) + (hit / (hit + neg))) / 2,2))
## [1] "Balanced accuracy: 0.87"
# machine learning performance on the holdout sample. Compute the confusion
# matrix once (factors with explicit levels are required by current caret)
# rather than rebuilding it for each extracted statistic.
cm.test = confusionMatrix(factor(pred_test, levels = c(0, 1)),
                          factor(y_test, levels = c(0, 1)))
cm.test$table
## Reference
## Prediction 0 1
## 0 1876 23
## 1 4 23
cm.test$overall[1]
## Accuracy
## 0.9859813
cm.test$byClass[11]
## Balanced Accuracy
## 0.7489362
# manual heuristic classifier: extract counts by name rather than positional
# [[i]], which silently mislabels the categories if any level is absent
hits.test = table(comp.test$hits.tot)
hits.test
##
## cor.rej hit neg pos
## 1868 33 12 13
#table(comp.test$hits)
cor.rej = hits.test[["cor.rej"]]
hit = hits.test[["hit"]]
neg = hits.test[["neg"]]
pos = hits.test[["pos"]]
total = cor.rej + hit + neg + pos
sprintf('Accuracy: %s', round((cor.rej + hit) / total,2))
## [1] "Accuracy: 0.99"
# balanced accuracy
sprintf('Balanced accuracy: %s', round(((cor.rej / (cor.rej + pos)) + (hit / (hit + neg))) / 2,2))
## [1] "Balanced accuracy: 0.86"
# fixed +/- .25 rp threshold lines, one row per subject/run facet
thresholds = comp %>%
  filter(tile == 1) %>%
  distinct(subjectID, run) %>%
  mutate(upper = .25,
         lower = -.25)
# Euclidian translation derivative per volume, hits highlighted and labeled
ggplot(filter(comp, tile == 1), aes(x = volume, y = euclidian_trans_deriv)) +
  geom_line(size = .25) +
  geom_point(data = subset(comp, !is.na(hits)), aes(color = hits), size = 2.5) +
  geom_text(aes(label = label), size = 1.5) +
  facet_wrap(~ subjectID + run, ncol = 4, scales = "free") +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A","#F21A00")) +
  geom_hline(data = thresholds, aes(yintercept = upper), color = "#F21A00") +
  geom_hline(data = thresholds, aes(yintercept = lower), color = "#F21A00") +
  theme(axis.text.x = element_text(size = 6))
# +/- 2 SD mean-intensity-change thresholds carried over from comp
# (computed once over all runs, so identical across facets)
thresholds = comp %>%
  filter(tile == 1) %>%
  distinct(subjectID, run, upper.mean, lower.mean) %>%
  mutate(upper = upper.mean,
         lower = lower.mean)
# volume-to-volume change in mean intensity, hits highlighted and labeled
ggplot(filter(comp, tile == 1), aes(x = volume, y = Diff.mean)) +
  geom_line(size = .25) +
  geom_point(data = subset(comp, !is.na(hits)), aes(color = hits), size = 2.5) +
  geom_text(aes(label = label), size = 1.5) +
  facet_wrap(~ subjectID + run, ncol = 4, scales = "free") +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A","#F21A00")) +
  geom_hline(data = thresholds, aes(yintercept = upper), color = "#F21A00") +
  geom_hline(data = thresholds, aes(yintercept = lower), color = "#F21A00") +
  theme(axis.text.x = element_text(size = 6))